In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
In [5]:
# Generalization bound (VC dimension)
# δ = 4 * mH(2N) * e ^ (- epsilon ^ 2 * N / 8)
# epsilon = sqrt((8 / N) * ln(4 * mH(2N) / δ)) = Omega(N, H, δ)
# Replacing the growth function with the simple polynomial bound mH(N) <= N ^ d_VC, so mH(2N) <= (2N) ^ d_VC:
# δ = 4 * (2N) ^ d_VC * e ^ (- epsilon ^ 2 * N / 8)
# Taking logs: d_VC * ln(2N) - epsilon ^ 2 * N / 8 - ln(δ / 4) = 0
# epsilon = sqrt((8 / N) * ln(4 * (2N) ^ d_VC / δ)) = Omega(N, H, δ)
In [305]:
# dVC = 10
# δ = 0.05
# epsilon = 0.05
def vc_bound_eqn(N, delta=0.05, epsilon=0.05, d_VC=10):
    return d_VC * np.log(2*N) - (epsilon ** 2 * N / 8.0) - np.log(delta/4)
# 0.05 = 4 * (2*N) ** 10 * e ** (- 0.05 ** 2 * N / 8)
for N in [400000, 420000, 440000, 460000, 480000]:
    print("N: {}, value: {}".format(N, vc_bound_eqn(N)))
In [7]:
# There are different generalization bounds; they are implemented in the functions below
def original_VC_bound(N, d_VC=50, delta=0.05):
    return np.sqrt((8.0 * d_VC / N) * np.log(4.0 * N / delta))
def original_VC_bound_2(N, d_VC=50, delta=0.05):
    return np.sqrt((8.0 * d_VC / N) * np.log(8.0 * N / delta))
def rademacher_penalty_bound(N, d_VC=50, delta=0.05):
    return np.sqrt((2.0 * (d_VC + 1) / N) * np.log(2.0 * N)) + np.sqrt((2.0 / N) * np.log(1.0 / delta)) + (1.0 / N)
def parrondo_and_van_den_broek(N, d_VC=50, delta=0.05):
    return (1.0 / N) + np.sqrt((1.0 / N**2) + ((d_VC * np.log(6.0 * N / delta)) / N))
def parrondo_and_van_den_broek_2(N, d_VC=50, delta=0.05):
    return (1.0 / N) + np.sqrt((1.0 / N**2) + ((d_VC * np.log(12.0 * N / delta)) / N))
def devroye(N, d_VC=50, delta=0.05):
    return (1.0 / N) - (2.0 / N**2) + np.sqrt((N - 2.0)**2 / N**4 + d_VC * np.log(4.0 * N / delta) / (2.0 * (N - 2)))
def devroye_2(N, d_VC=50, delta=0.05):
    return (1.0 / N) - (2.0 / N**2) + np.sqrt((N - 2.0)**2 / N**4 + d_VC * 2.0 * np.log(4.0 * N / delta) / (2.0 * (N - 2)))
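The Parrondo/Van den Broek and Devroye bounds are implicit in epsilon; the closed forms above come from solving the resulting quadratic for epsilon. As a sanity check, the same value can be reached by fixed-point iteration. This is a sketch under the same simplified growth-function term used above (d_VC * ln(6N/δ)); the function parrondo_fixed_point is mine, not part of the original notebook.

def parrondo_fixed_point(N, d_VC=50, delta=0.05, iters=50):
    # Iterate epsilon = sqrt((2*epsilon + d_VC*ln(6N/delta)) / N) until it settles.
    eps = 0.0
    for _ in range(iters):
        eps = np.sqrt((2.0 * eps + d_VC * np.log(6.0 * N / delta)) / N)
    return eps
# Should agree with the closed-form parrondo_and_van_den_broek, e.g. at N = 10000.
print(parrondo_fixed_point(10000.0), parrondo_and_van_den_broek(10000.0))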
In [8]:
N = np.arange(100, 12000, 100)
plt.plot(N, original_VC_bound(N), label='original vc')
plt.plot(N, original_VC_bound_2(N), label='original vc 2')
plt.plot(N, rademacher_penalty_bound(N), label='rademacher penalty')
plt.plot(N, parrondo_and_van_den_broek(N), label='parrondo and van den broek')
plt.plot(N, parrondo_and_van_den_broek_2(N), label='parrondo and van den broek 2')
plt.plot(N, devroye(N), label='devroye')
plt.plot(N, devroye_2(N), label='devroye 2')
plt.legend(bbox_to_anchor=(1.5, 1.5))
plt.show()
In [9]:
N = np.arange(9000, 11000, 100);
plt.plot(N, original_VC_bound(N), label='original vc')
plt.plot(N, original_VC_bound_2(N), label='original vc 2')
plt.plot(N, rademacher_penalty_bound(N), label='rademacher penalty')
plt.plot(N, parrondo_and_van_den_broek(N), label='parrondo and van den broek')
plt.plot(N, parrondo_and_van_den_broek_2(N), label='parrondo and van den broek 2')
plt.plot(N, devroye(N), label='devroye')
plt.plot(N, devroye_2(N), label='devroye 2')
plt.legend(bbox_to_anchor=(1.5, 1.5))
plt.show()
In [307]:
N = np.arange(4, 8, 1);
plt.plot(N, original_VC_bound(N), label='original vc')
plt.plot(N, original_VC_bound_2(N), label='original vc 2')
plt.plot(N, rademacher_penalty_bound(N), label='rademacher penalty')
plt.plot(N, parrondo_and_van_den_broek(N), label='parrondo and van den broek')
plt.plot(N, parrondo_and_van_den_broek_2(N), label='parrondo and van den broek 2')
plt.plot(N, devroye(N), label='devroye')
plt.plot(N, devroye_2(N), label='devroye 2')
plt.legend(bbox_to_anchor=(1.5, 1.5))
plt.show()
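Reading values off the zoomed plots is imprecise, so printing the bounds at specific sample sizes makes the comparison explicit. A small illustrative sketch, not part of the original notebook (the choices N = 10000 and N = 5 simply follow the ranges plotted above):

bounds = [original_VC_bound, rademacher_penalty_bound,
          parrondo_and_van_den_broek, devroye]
for n in [10000.0, 5.0]:
    print("N = {}".format(n))
    for b in bounds:
        print("  {}: {:.4f}".format(b.__name__, b(n)))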
In [11]:
X = np.random.uniform(low=-1.0, high=1.0, size=2)
X = X.T
X
Out[11]:
In [12]:
def f(x):
    return np.sin(np.pi * x)
f(X)
Out[12]:
In [13]:
x_min = -1
x_max = 1
y_min = x_min
y_max = x_max
In [14]:
plt.grid(True)
t = np.arange(-1.0, 1.0, .01)
plt.plot(t, np.sin(np.pi * t))
plt.axis([x_min, x_max, y_min, y_max])
plt.scatter(X, f(X))
Out[14]:
In [15]:
from numpy.linalg import inv
def linear_regression(X, y):
    # One-shot least squares: w = (X^T X)^(-1) X^T y.
    # If X has a single feature column, X^T X is a scalar, so just take its reciprocal.
    X_T_X = np.dot(X.T, X)
    inverse = inv(X_T_X) if np.ndim(X_T_X) > 0 else 1.0 / X_T_X
    X_dagger = np.dot(inverse, X.T)
    weights = np.dot(X_dagger, y)
    return weights
In [16]:
linear_regression(X, f(X))
Out[16]:
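As a quick sanity check on the pseudo-inverse solution, the same slope can be obtained from NumPy's built-in least-squares solver. This snippet is an addition for illustration, not part of the original run.

# Fit y = a*x through the origin with lstsq and compare to linear_regression.
a_lstsq, _, _, _ = np.linalg.lstsq(X.reshape(-1, 1), f(X))
print(a_lstsq, linear_regression(X, f(X)))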
In [45]:
num_experiments = 1000
a = []
for i in range(num_experiments):
    X_exp = np.random.uniform(low=-1.0, high=1.0, size=2)
    X_exp = X_exp.T
    y_exp = np.sin(np.pi * X_exp)
    a.append(linear_regression(X_exp, y_exp))
a_hat = np.mean(a)
In [46]:
print "a_hat is {} after {} experiments (or {} after rounding)".format(a_hat, num_experiments, np.round(a_hat, 2))
In [49]:
plt.axis([x_min, x_max, y_min, y_max])
p = np.array([-1, 1])
for ai in a:
    plt.plot(p, ai * p, color='lightgrey')
plt.grid(True)
t = np.arange(-1.0, 1.0, .01)
plt.plot(t, np.sin(np.pi * t))
plt.plot(p, a_hat * p, lw=2)
plt.plot(t, a_hat * t - np.sin(np.pi * t))
plt.plot(t, (a_hat * t - np.sin(np.pi * t))**2)
Out[49]:
In [80]:
x = np.arange(-1.0, 1.0, .01)
bias = np.mean((a_hat * x - np.sin(np.pi * x))**2)
bias = np.round(bias, 2)
print "Bias is {}".format(bias)
In [81]:
variance = np.mean((np.outer(np.array(a), x) - a_hat * x)**2)
variance = np.round(variance, 2)
print "Variance is {}".format(variance)
In [82]:
print "Also, the expected error is bias + variance = {} + {} = {}".format(bias, variance, bias + variance)
In [297]:
def expected_error(transformation):
    # Estimate bias, variance and expected out-of-sample error for the hypothesis set
    # obtained by fitting linear regression on transformation(x).
    num_experiments = 10000
    step = 0.01
    weights = []
    for i in range(num_experiments):
        X_exp = np.random.uniform(low=-1.0, high=1.0, size=2)
        y_exp = np.sin(np.pi * X_exp)
        weights.append(linear_regression(transformation(X_exp), y_exp))
    weights_hat = np.mean(weights, axis=0)
    x = np.arange(-1.0, 1.0, step)
    bias = np.mean((np.dot(transformation(x), weights_hat) - np.sin(np.pi * x))**2)
    if len(np.shape(weights)) == 1:
        A = np.outer(np.array(weights), transformation(x))
    else:
        A = np.dot(np.array(weights), transformation(x).transpose())
    B = np.dot(transformation(x), weights_hat)
    variance = np.mean((A - B)**2)
    expected_error = bias + variance
    plt.axis([x_min, x_max, y_min, y_max])
    for w in weights:
        plt.plot(x, np.dot(transformation(x), w), color='lightgrey')
    plt.grid(True)
    plt.plot(x, np.sin(np.pi * x), color='blue')
    if len(np.shape(weights)) == 1:
        plt.plot(transformation(x), np.sin(np.pi * x), color='black')
    else:
        print(np.shape(transformation(x)))
        print(np.shape(transformation(x)[:,1]))
        plt.plot(transformation(x)[:,1], np.sin(np.pi * x), color='black')
    p = np.array([-1, 1])
    plt.plot(x, np.dot(transformation(x), weights_hat), lw=2, color='green')
    plt.plot(x, (np.dot(transformation(x), weights_hat) - np.sin(np.pi * x))**2, color='red')
    print("weights: {}".format(np.round(weights_hat, 2)))
    print("Bias: {}, Variance: {}, E(error): {}".format(np.round(bias, 2), np.round(variance, 2), np.round(expected_error, 2)))
In [298]:
def only_threshold(x):
    return np.ones(np.size(x))
expected_error(only_threshold)
In [299]:
def identity(x):
    return x
expected_error(identity)
In [300]:
def with_threshold(x):
    return np.array([np.ones(np.size(x)), x]).transpose()
expected_error(with_threshold)
In [301]:
def square(x):
    return x**2
expected_error(square)
In [302]:
def square_with_threshold(x):
    return np.array([np.ones(np.size(x)), x**2]).transpose()
expected_error(square_with_threshold)
It seems fair to say that q is the VC dimension, though I don't have either a good proof or a good intuition for this. I'll say the answer to question 8 is c.
If a hypothesis is in the intersection, it is in every set, including the one with the lowest VC dimension. If the intersection could shatter a larger set, then the VC dimension of the set with the smallest VC dimension would have to be higher, which is a contradiction. So the maximum is the minimum VC dimension over all the sets, and the minimum is 0 (e.g. if the intersection is empty).
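In symbols (my notation, just restating the argument above): 0 <= d_VC(H_1 ∩ ... ∩ H_K) <= min{ d_VC(H_k), k = 1..K }.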
I'll say the answer to question 9 is b.
I'll say the answer to question 10 is either d or e. The upper bound seems loose to me, so I'll choose the smaller one. So the answer is d.
Homework: https://work.caltech.edu/homework/hw4.pdf
Answers:
Answer key: https://work.caltech.edu/homework/hw4_sol.pdf